import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Absolute paths to the challenge CSVs on the author's machine.
train_path = '/home/gaurav/Desktop/Social Cops/socialcops_challenge/socialcops_challenge/land_train.csv'
test_path = '/home/gaurav/Desktop/Social Cops/socialcops_challenge/socialcops_challenge/land_test.csv'
# Load the training and test splits into DataFrames.
data = pd.read_csv(train_path)
test = pd.read_csv(test_path)
print("The first 5 rows of the training dataset\n")
# NOTE: bare expressions like this only render in a notebook; as a plain
# script they produce no output.
data.head(5)
print("Basic summary of the training dataset")
# Per-column count/mean/std/quartile statistics.
data.describe()
data.info() #to check null values
Clearly there are no missing values in the data!
# Distribution of each of the six X features, one subplot per feature.
# FIX: the flattened source lost the loop-body indentation (SyntaxError
# as plain Python); restored here.
sns.set(style='whitegrid')
fig, axis = plt.subplots(nrows=1, ncols=6, figsize=(25, 4))
for i in range(6):
    axis[i].set_title("Distribution Plot {}".format('X'+str(i+1)))
    # NOTE: distplot is deprecated in seaborn >= 0.11 (histplot/displot
    # is the replacement) but is kept for compatibility with this run.
    sns.distplot(data['X'+str(i+1)], ax=axis[i])
plt.show()
From the distribution plots we can easily infer that the X1, X2, and X3 features are highly correlated, as their plots are very similar to each other.
# Pairwise regression plots among X1/X2/X3, colored by class, to inspect
# the suspected correlation.
# FIX: pass x/y as keywords and use `height=` — positional x/y and the
# `size=` parameter were deprecated (and later removed) in seaborn >= 0.9.
sns.lmplot(x='X1', y='X2', data=data, hue='target', fit_reg=True, height=5)
sns.lmplot(x='X1', y='X3', data=data, hue='target', fit_reg=True, height=5)
sns.lmplot(x='X2', y='X3', data=data, hue='target', fit_reg=True, height=5)
From the plots above, it can be observed how X1, X2, and X3 are correlated; the plot between X2 and X3 shows a very high positive correlation irrespective of class.
# Distribution of each of the six I features, one subplot per feature.
# FIX: the flattened source lost the loop-body indentation (SyntaxError
# as plain Python); restored here.
sns.set(style='whitegrid')
fig, axis = plt.subplots(nrows=1, ncols=6, figsize=(25, 4))
for i in range(6):
    axis[i].set_title("Distribution Plot {}".format('I'+str(i+1)))
    # NOTE: distplot is deprecated in seaborn >= 0.11; kept for
    # compatibility with this run.
    sns.distplot(data['I'+str(i+1)], ax=axis[i])
plt.show()
Similarly, I2, I3, and I4 also seem correlated.
# Pairwise regression plots among I1/I3/I4, colored by class.
# FIX: keyword x/y and `height=` instead of the deprecated positional
# args and `size=` (seaborn >= 0.9).
sns.lmplot(x='I1', y='I3', data=data, hue='target', fit_reg=True, height=5)
sns.lmplot(x='I1', y='I4', data=data, hue='target', fit_reg=True, height=5)
sns.lmplot(x='I3', y='I4', data=data, hue='target', fit_reg=True, height=5)
The scatter plot between I1 and I4 shows high correlation irrespective of class.
# Full pairwise scatter matrix colored by class.
# FIX: pairplot's `size=` was renamed to `height=` in seaborn 0.9.
sns.pairplot(data, hue="target", height=3)
# Heatmap to see correlation of different feature
f, ax = plt.subplots(figsize=(13, 13))
sns.heatmap(data.corr(), annot=True)
plt.show()
from sklearn.ensemble import ExtraTreesClassifier

# Quick feature-relevance ranking via a tree ensemble: fit on all feature
# columns (everything but the last) against the last column as the label.
features, labels = data.values[:, :-1], data.values[:, -1]
model = ExtraTreesClassifier()
model.fit(features, labels)
print(model.feature_importances_)
from sklearn.preprocessing import StandardScaler
# Separating out the features.
# BUG FIX: the original used data.iloc[:, :], which kept the 'target'
# column among the PCA inputs (label leakage). The sibling cells below
# (iloc[:, :6] and iloc[:, 6:-1]) confirm features are all columns
# except the last, so exclude the label here too.
x = data.iloc[:, :-1].values
# Separating out the target
y = data.loc[:, ['target']].values
# Standardizing the features
x = StandardScaler().fit_transform(x)
from sklearn.decomposition import PCA
# PCA over every (standardized) feature; inspect how the variance is
# distributed across components.
pca1 = PCA()
x = pca1.fit_transform(x)
pca1.explained_variance_ratio_
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# PCA restricted to the first six columns (the X features), after
# standardizing them; the label column is kept aside in y.
y = data.loc[:, ['target']].values
x = StandardScaler().fit_transform(data.iloc[:, :6].values)
pca2 = PCA()
x = pca2.fit_transform(x)
# Fraction of total variance carried by each principal component.
pca2.explained_variance_ratio_
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# PCA restricted to columns 6..second-to-last (the I features), after
# standardizing them; the label column is kept aside in y.
y = data.loc[:, ['target']].values
x = StandardScaler().fit_transform(data.iloc[:, 6:-1].values)
pca3 = PCA()
x = pca3.fit_transform(x)
# Fraction of total variance carried by each principal component.
pca3.explained_variance_ratio_
Explained Variance
The explained variance tells us how much information (variance) can be attributed to each of the principal components.
Clearly the last component (feature, in this case) contains the least variance; therefore we drop it.
# I6 carries the least explained variance (per the PCA above), so remove
# it from the training frame.
data = data.drop(columns='I6')
Z-Score Approach
Reference : https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba
from scipy import stats
import numpy as np

# Outlier removal: keep only rows whose every feature column has an
# absolute z-score below 3 (label column excluded from the scoring).
z = np.abs(stats.zscore(data.iloc[:, :-1]))
print(z)
keep_mask = (z < 3).all(axis=1)
filtered = data[keep_mask]
# Compare row counts before/after filtering.
filtered.shape
data.shape
# Class balance of the surviving rows.
plt.hist(filtered.iloc[:, -1])
filtered['target'].value_counts()
from sklearn.model_selection import train_test_split

# 75/25 train/validation split of the outlier-filtered data; the fixed
# seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    filtered.iloc[:, :-1],
    filtered.iloc[:, -1],
    test_size=0.25,
    random_state=1,
)
for part in (X_train, X_val, y_train, y_val):
    print(part.shape)
Using SMOTE - Synthetic Minority OverSampling Method
from imblearn.over_sampling import SMOTE

# Synthetically oversample minority classes in the training split only —
# the validation split is left untouched.
smote = SMOTE()
X_train, y_train = smote.fit_resample(X_train, y_train)
# Visual check that the classes are now balanced.
plt.hist(y_train)
One Hot encoding the categorical data
from keras.utils import np_utils

# One-hot encode the integer labels for the softmax output layer.
y_train = np_utils.to_categorical(y_train)
y_val = np_utils.to_categorical(y_val)
print(y_train)
print(y_train.shape)
print(y_val.shape)
# Labels appear to start at 1 (predictions are shifted by +1 later), so
# column 0 of the encoding is never set — slice it away.
y_train = y_train[:, 1:]
y_val = y_val[:, 1:]
print(y_train.shape)
print(y_val.shape)
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transform to the validation split (no leakage of val statistics).
normalizer = StandardScaler()
X_train = normalizer.fit_transform(X_train)
X_val = normalizer.transform(X_val)
print(X_train.shape)
print(X_val.shape)
#Importing Keras Libraries
import keras
from keras.layers import Input,Dense,BatchNormalization, Activation, Dropout, Add
from keras.models import Model
from keras.utils.vis_utils import plot_model
from keras import optimizers
# Training hyper-parameters.
epochs = 50
n_classes = 4
batch = 150

# Functional-API MLP with a skip connection: the first hidden block's
# activation is added back to the third block's output.
inp = Input(shape=(11,), name='input')

h1 = Dense(units=20)(inp)
h1 = BatchNormalization()(h1)
h1 = Activation('relu')(h1)

h2 = Dense(units=20)(h1)
h2 = BatchNormalization()(h2)
h2 = Activation('relu')(h2)
h2 = Dropout(0.4)(h2)

h3 = Dense(units=20)(h2)
h3 = BatchNormalization()(h3)
h3 = Activation('relu')(h3)

# Residual merge of the first and third hidden blocks.
merged = Add()([h1, h3])
merged = BatchNormalization()(merged)
merged = Activation('relu')(merged)
merged = Dropout(0.2)(merged)

out = Dense(units=n_classes, activation='softmax')(merged)
model = Model(inputs=inp, outputs=out)
model.summary()

# Persist only the checkpoint with the best validation loss seen so far.
ckpt = keras.callbacks.ModelCheckpoint(
    'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    monitor='val_loss', verbose=0, save_best_only=True,
    save_weights_only=False, mode='auto', period=1,
)
opt = optimizers.Adam(lr=0.01, decay=0.0001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
output = model.fit(
    x=X_train, y=y_train, batch_size=batch, epochs=epochs,
    validation_data=(X_val, y_val), callbacks=[ckpt],
)
In only 50 epochs the validation loss is already below the training loss, so we can infer that the model is generalizing well.
# Training vs validation accuracy per epoch.
epoch_axis = list(range(1, epochs + 1))
plt.plot(epoch_axis, output.history['acc'])
plt.plot(epoch_axis, output.history['val_acc'])
plt.show()
# Training vs validation loss per epoch.
plt.plot(epoch_axis, output.history['loss'])
plt.plot(epoch_axis, output.history['val_loss'])
plt.show()
# Loading the model and Weights of which gave the least validation loss
from keras.models import load_model

# NOTE: the filename encodes the epoch and val_loss of one particular
# training run; it must match an actual checkpoint on disk.
best_checkpoint = '/home/gaurav/Desktop/Social Cops/socialcops_challenge/socialcops_challenge/weights.47-0.04.hdf5'
model = load_model(best_checkpoint)
Classification Report
from sklearn.metrics import classification_report

# Predicted class = argmax over the softmax outputs, shifted by +1 back
# to the original label range (column 0 of the one-hot was dropped).
pred_labels = np.argmax(model.predict(X_val), axis=1) + 1
true_labels = np.argmax(y_val, axis=1) + 1
print(classification_report(true_labels, pred_labels))
# Score the held-out test file with the trained network.
test = test.drop('I6', axis=1)  # mirror the I6 drop applied to training data
X_test = np.array(test)
X_test = normalizer.transform(X_test)  # reuse the scaler fitted on X_train
X_test.shape
y_pred = np.argmax(model.predict(X_test), axis=1)
print(y_pred)
y_pred = y_pred + 1  # shift back to the original label range
print(y_pred)
# FIX: assign the ndarray directly. The original wrapped it in
# pd.DataFrame(y_pred), which assigns by index alignment and silently
# produces NaNs whenever `test` does not have a default RangeIndex.
test['target'] = y_pred
test
# Build the labelled output file: restore the I6 column (dropped before
# scoring) from the raw test CSV, keeping 'target' as the last column.
temp = pd.read_csv('/home/gaurav/Desktop/Social Cops/socialcops_challenge/socialcops_challenge/land_test.csv')
df = pd.DataFrame(data=test)
df['I6'] = temp['I6']
# Move 'target' to the end of the column order.
target_col = df.pop('target')
df['target'] = target_col
print(df)
df.to_csv('labelled_land_test.csv', index=False)